Imports¶
In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from tkinter import *
from sklearn.preprocessing import LabelEncoder, StandardScaler
import plotly.express as px
Data Preprocessing¶
In [2]:
# Load the listening history and pull out the columns used by the later cells.
df = pd.read_csv("spotify_history.csv")
arrShuffle, arrMs_Played, arrReason_Start = (
    df[column] for column in ("shuffle", "ms_played", "reason_start")
)
# Binary flag per row: 1 when reason_end is anything other than "trackdone",
# 0 when it is exactly "trackdone".
arrReason_End = df["reason_end"].ne("trackdone").astype(int)
In [3]:
# Joint and conditional probabilities of skipping a track, split by shuffle state.
#
# Encoding reminder: arrReason_End is 1 when reason_end != "trackdone" and 0 when
# reason_end == "trackdone". A row therefore counts as a "skip" when the flag is 1
# (presumably "trackdone" marks a track that played to its end -- confirm against
# the data dictionary). The previous version of this cell counted flag == 0 as a
# skip, which swapped every skip / not-skip number.


def _ratio(count, total):
    """Return count / total, or NaN when total is 0 (empty data or empty group)."""
    return count / total if total else float("nan")


# Boolean masks, one entry per row of the listening history.
# Shuffle is read from its own value rather than by comparing with row 0, so the
# result no longer depends on whether the first row happens to be on shuffle.
# NOTE(review): assumes shuffle holds True/False (or "TRUE"/"1"-style text) -- confirm.
is_shuffle = arrShuffle.astype(str).str.strip().str.lower().isin(["true", "1"])
is_skip = arrReason_End == 1

total_rows = len(arrShuffle)

# Counts for the four (shuffle state, outcome) combinations.
notShuffle_Skip = int((~is_shuffle & is_skip).sum())
notShuffle_notSkip = int((~is_shuffle & ~is_skip).sum())
onShuffle_Skip = int((is_shuffle & is_skip).sum())
onShuffle_notSkip = int((is_shuffle & ~is_skip).sum())

# Group sizes used as denominators for the conditional probabilities.
shuffle = int(is_shuffle.sum())
notShuffle = total_rows - shuffle

# Joint probabilities over all rows:
#   odds[0] = P(skip and not on shuffle)
#   odds[1] = P(finish and not on shuffle)
#   odds[2] = P(skip and on shuffle)
#   odds[3] = P(finish and on shuffle)
odds = [
    _ratio(notShuffle_Skip, total_rows),
    _ratio(notShuffle_notSkip, total_rows),
    _ratio(onShuffle_Skip, total_rows),
    _ratio(onShuffle_notSkip, total_rows),
]

plt.plot(range(len(odds)), odds)
plt.xlabel("Index")
plt.ylabel("Percentage")
plt.title("Odds")
plt.show()

# P(Skipping a song | We are on shuffle)
p1 = _ratio(onShuffle_Skip, shuffle)
# P(Not skipping | We are not on shuffle)
p2 = _ratio(notShuffle_notSkip, notShuffle)
# P(Skipping a song | We are not on shuffle)
p3 = _ratio(notShuffle_Skip, notShuffle)
# P(Not skipping | We are on shuffle)
p4 = _ratio(onShuffle_notSkip, shuffle)

positions = [0, 1, 2, 3]  # positions for bars
plt.bar(positions, [p1, p2, p3, p4], color=["blue", "red", "pink", "purple"])
plt.xticks(
    positions,
    ["Skipping|Shuffle", "Not Skipping|Not Shuffle", "Skipping|Not Shuffle", "Not Skipping|Shuffle"],
    fontsize=8,
    rotation=20,
)
plt.xlabel("Conditions")
plt.ylabel("Probability")
plt.title("Conditional Probabilities")
plt.show()
In [4]:
# Skip / finish ratio as a function of how long a track was played.
#
# Encoding reminder: arrReason_End is 1 when reason_end != "trackdone" and 0 when
# reason_end == "trackdone". Rows with flag 1 are counted as skipped and rows with
# flag 0 as finished; the previous version of this cell stored them the other way
# round, so the "Skipping odds" and "Finishing odds" plots were swapped.

# Width of one playtime bucket in milliseconds (value kept from the original cell;
# its derivation is not shown in this notebook).
BUCKET_MS = 31222.5

# Bucket index of every row (floor division, so keys are whole-number floats).
ms_bucket = arrMs_Played // BUCKET_MS
was_skipped = arrReason_End == 1

# bucket -> number of rows in that bucket, per outcome.
skippedMS = ms_bucket[was_skipped].value_counts().to_dict()
finishedMS = ms_bucket[~was_skipped].value_counts().to_dict()

# Sorted bucket keys per outcome, so the line plots run left to right.
arr1 = sorted(skippedMS)
arr2 = sorted(finishedMS)

# Share of skipped (arr3) / finished (arr4) rows within each bucket. A bucket that
# only appears for one outcome gets a share of 1 for that outcome.
arr3 = {
    bucket: skippedMS[bucket] / (skippedMS[bucket] + finishedMS.get(bucket, 0))
    for bucket in arr1
}
arr4 = {
    bucket: finishedMS[bucket] / (finishedMS[bucket] + skippedMS.get(bucket, 0))
    for bucket in arr2
}

plt.plot(list(arr3.keys()), list(arr3.values()))
plt.xlabel("Keys")
plt.ylabel("Percentages")
plt.title("Skipping odds")
plt.show()

plt.plot(list(arr4.keys()), list(arr4.values()))
plt.xlabel("Keys")
plt.ylabel("Percentages")
plt.title("Finishing odds")
plt.show()
3-D Visualizations of the Data Set¶
In [5]:
def get_seconds_range(secs):
    """Convert a duration given in milliseconds to seconds.

    Despite its name, ``secs`` is the millisecond value; it is coerced with
    ``float`` and divided by 1000.
    """
    milliseconds = float(secs)
    return milliseconds / 1000
# Re-read the raw history so this section starts from the file contents.
df = pd.read_csv("spotify_history.csv")

# reason_end becomes a 0/1 flag: 1 for anything other than "trackdone".
df["reason_end"] = df["reason_end"].ne("trackdone").astype(int)

# Categorical axes are plotted from their string form; playtime as int ms and float seconds.
for text_column in ("reason_start", "shuffle"):
    df[text_column] = df[text_column].map(str)
df["ms_played"] = df["ms_played"].map(int)
df["secs_played"] = df["ms_played"].map(get_seconds_range)

# Integer codes of the categorical axes plus raw milliseconds, shown on hover only.
Xuniques, x_codes = np.unique(df["reason_start"], return_inverse=True)
df["X"] = x_codes
Yuniques, y_codes = np.unique(df["shuffle"], return_inverse=True)
df["Y"] = y_codes
df["Z"] = df["ms_played"].to_numpy()

# One point per row: start reason x shuffle state x seconds played, coloured by the end flag.
fig = px.scatter_3d(
    df,
    x="reason_start",
    y="shuffle",
    z="secs_played",
    color="reason_end",
    hover_data=["X", "Y", "Z", "reason_end"],
)
fig.show()